diff --git a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json similarity index 75% rename from data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json rename to data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json index 42f19b810..8176fa91a 100644 --- a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json +++ b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.475, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + 
"additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json similarity index 75% rename from data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json rename to data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json index c596a8093..4d2b264af 100644 --- a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json +++ b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { 
"source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.44, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": 
"WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json similarity index 75% rename from data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json rename to data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json index da8bb1b91..39fbc0d1c 100644 --- a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json +++ b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.405, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json 
b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json similarity index 75% rename from data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json rename to data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json index cb4638d3d..99d31c069 100644 --- a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json +++ b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.332, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + 
"num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json similarity index 75% rename from data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json rename to data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json index 0670c6db3..c786f36c7 100644 --- a/data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json +++ b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.551, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - 
"evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json similarity index 75% rename from data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json rename to data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json index 2c6f0abd0..6219cdf47 100644 --- a/data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json +++ b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.522, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json similarity index 76% rename from data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json rename to data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json index 3a64b94b2..d9f1bd857 100644 --- a/data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json +++ b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - 
"source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.637, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json similarity index 75% rename from data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json rename to data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json index bbdb8512b..658945ff5 100644 --- a/data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json +++ b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.591, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT 
correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json similarity index 76% rename from 
data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json rename to data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json index 44b7ab97a..d63e271d1 100644 --- a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json +++ b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.549, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": 
"true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json rename to data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json index b8e94bdb5..c53a3aa66 100644 --- a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json +++ b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet 
(20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.653, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, 
rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json rename to data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json index a41bf85dc..1f5c52f66 100644 --- a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json +++ b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.7 Sonnet (20250219)", + "name": "Claude 3.7 Sonnet 20250219", "id": "anthropic/claude-3-7-sonnet-20250219", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.674, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - 
"evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json 
b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json new file mode 100644 index 000000000..da15e55a7 --- /dev/null +++ b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Claude 4.5 Haiku 20251001", + "id": "anthropic/claude-haiku-4-5-20251001", + "developer": "anthropic", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.717, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 7.381503096938465 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.777, + "details": { + "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=3.701, mean=3.701, max=3.701, sum=3.701 (1)", + "tab": "Efficiency", + "score": 3.7008020806312563 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", + "tab": "General information", + "score": 252.461 + }, + "MMLU-Pro - # output tokens": { + "description": "min=374.129, mean=374.129, max=374.129, sum=374.129 (1)", + "tab": "General information", + "score": 374.129 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + 
"score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.605, + "details": { + "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=5.102, mean=5.102, max=5.102, sum=5.102 (1)", + "tab": "Efficiency", + "score": 5.102193982611857 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", + "tab": "General information", + "score": 272.73766816143495 + }, + "GPQA - # output tokens": { + "description": "min=524.525, mean=524.525, max=524.525, sum=524.525 (1)", + "tab": "General information", + "score": 524.5246636771301 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.801, + "details": { + "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=4.355, mean=4.355, max=4.355, sum=4.355 (1)", + "tab": "Efficiency", + "score": 4.355410516372229 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", + "tab": "General information", + "score": 47.15896487985213 + }, + "IFEval - # output tokens": { + "description": "min=390.416, mean=390.416, max=390.416, sum=390.416 (1)", + "tab": "General information", + "score": 390.4158964879852 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.839, + "details": { + "description": "min=0.839, mean=0.839, max=0.839, sum=0.839 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=16.317, mean=16.317, max=16.317, sum=16.317 (1)", + "tab": "Efficiency", + 
"score": 16.317131044387818 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1835.337, mean=1835.337, max=1835.337, sum=1835.337 (1)", + "tab": "General information", + "score": 1835.337 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.561, + "details": { + "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=7.432, mean=7.432, max=7.432, sum=7.432 (1)", + "tab": "Efficiency", + "score": 7.431977860689163 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", + "tab": "General information", + "score": 110.563 + }, + "Omni-MATH - # output tokens": { + "description": "min=937.799, mean=937.799, max=937.799, sum=937.799 (1)", + "tab": "General information", + "score": 937.799 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json rename to data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json index a9349e9cb..c554c6a65 100644 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json +++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + 
"schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Opus (20250514, extended thinking)", + "name": "Claude 4 Opus 20250514, extended thinking", "id": "anthropic/claude-opus-4-20250514-thinking-10k", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.78, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - 
"evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json rename to data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json index c82ca8963..240e9ebf4 100644 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json +++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Opus (20250514)", + "name": "Claude 4 Opus 20250514", "id": "anthropic/claude-opus-4-20250514", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.757, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json index 6bf01f358..ecc6c0f0a 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Sonnet (20250514, extended thinking)", + "name": "Claude 4 Sonnet 20250514, extended thinking", "id": "anthropic/claude-sonnet-4-20250514-thinking-10k", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.766, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json index af4facce4..b4413ccdd 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": 
"helm_capabilities/anthropic_claude-sonnet-4-20250514/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Sonnet (20250514)", + "name": "Claude 4 Sonnet 20250514", "id": "anthropic/claude-sonnet-4-20250514", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.733, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval 
Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json new file mode 100644 index 000000000..e0991c0d9 --- /dev/null +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Claude 4.5 Sonnet 20250929", + "id": "anthropic/claude-sonnet-4-5-20250929", + "developer": "anthropic", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.762, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 17.536448448412127 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + 
"evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.869, + "details": { + "description": "min=0.869, mean=0.869, max=0.869, sum=0.869 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=9.03, mean=9.03, max=9.03, sum=9.03 (1)", + "tab": "Efficiency", + "score": 9.029817205530268 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", + "tab": "General information", + "score": 252.461 + }, + "MMLU-Pro - # output tokens": { + "description": "min=392.292, mean=392.292, max=392.292, sum=392.292 (1)", + "tab": "General information", + "score": 392.292 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.686, + "details": { + "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=12.414, mean=12.414, max=12.414, sum=12.414 (1)", + "tab": "Efficiency", + "score": 12.414452127318263 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", + "tab": "General information", + "score": 272.73766816143495 + }, + "GPQA - # output tokens": { + "description": "min=544.215, mean=544.215, max=544.215, sum=544.215 (1)", + "tab": "General information", + "score": 544.2152466367713 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.85, + "details": { + "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", + "tab": "Accuracy", 
+ "IFEval - Observed inference time (s)": { + "description": "min=10.904, mean=10.904, max=10.904, sum=10.904 (1)", + "tab": "Efficiency", + "score": 10.90394415211986 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", + "tab": "General information", + "score": 47.15896487985213 + }, + "IFEval - # output tokens": { + "description": "min=414.632, mean=414.632, max=414.632, sum=414.632 (1)", + "tab": "General information", + "score": 414.63216266173754 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.854, + "details": { + "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=38.544, mean=38.544, max=38.544, sum=38.544 (1)", + "tab": "Efficiency", + "score": 38.54364204096484 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1804.604, mean=1804.604, max=1804.604, sum=1804.604 (1)", + "tab": "General information", + "score": 1804.604 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.553, + "details": { + "description": "min=0.553, mean=0.553, max=0.553, sum=0.553 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=16.79, mean=16.79, max=16.79, sum=16.79 (1)", + "tab": "Efficiency", + "score": 16.790386716127397 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + 
"score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", + "tab": "General information", + "score": 110.563 + }, + "Omni-MATH - # output tokens": { + "description": "min=892.774, mean=892.774, max=892.774, sum=892.774 (1)", + "tab": "General information", + "score": 892.774 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json similarity index 76% rename from data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json rename to data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json index 0b36b4b41..682cc94cc 100644 --- a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json +++ b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.699, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json similarity index 76% rename from data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json rename to data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json index 3502a2f83..3b034de70 100644 --- a/data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json +++ b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.665, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json similarity index 75% rename from data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json rename to data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json index 9cecc3e6e..7d4281de4 100644 --- a/data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json +++ b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git 
a/data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json similarity index 75% rename from data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json rename to data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json index c41c3cf10..3c438fd59 100644 --- a/data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json +++ b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.657, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - 
"use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json similarity index 76% rename from data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json rename to data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json index 963d02bef..7f589b967 100644 --- a/data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json +++ b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { 
"evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, 
"generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json similarity index 75% rename from data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json rename to data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json index 87e886284..0376cdf40 100644 --- a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json +++ b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash Lite (02-05 preview)", + "name": "Gemini 2.0 Flash Lite 02-05 preview", "id": "google/gemini-2.0-flash-lite-preview-02-05", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.642, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + 
"evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json similarity index 76% rename from 
data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json rename to data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json index a5294b486..600681fbb 100644 --- a/data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json +++ b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.591, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json similarity index 75% rename from data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json rename to data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json index d0e1ed757..221dc7a91 100644 --- a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json +++ b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.5 Flash (04-17 preview)", + "name": "Gemini 2.5 Flash 04-17 preview", "id": "google/gemini-2.5-flash-preview-04-17", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { 
"evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.626, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, 
"generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json similarity index 75% rename from data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json rename to data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json index f1093c814..355cd3bc1 100644 --- a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json +++ b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.5 Pro (03-25 preview)", + "name": "Gemini 2.5 Pro 03-25 preview", "id": "google/gemini-2.5-pro-preview-03-25", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.745, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json new file mode 100644 index 000000000..d3ecb3ebb --- /dev/null +++ b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json @@ -0,0 +1,345 @@ +{ + 
"schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Gemini 3 Pro Preview", + "id": "google/gemini-3-pro-preview", + "developer": "google", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.799, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 50.969324812798575 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.903, + "details": { + "description": "min=0.903, mean=0.903, max=0.903, sum=0.903 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=34.903, mean=34.903, max=34.903, sum=34.903 (1)", + "tab": "Efficiency", + "score": 34.903078527212145 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", + "tab": "General information", + "score": 263.673 + }, + "MMLU-Pro - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.803, + "details": { + "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=69.164, mean=69.164, max=69.164, sum=69.164 (1)", + "tab": "Efficiency", 
+ "score": 69.16407415364355 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", + "tab": "General information", + "score": 273.7354260089686 + }, + "GPQA - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.876, + "details": { + "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=18.201, mean=18.201, max=18.201, sum=18.201 (1)", + "tab": "Efficiency", + "score": 18.200553727458452 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", + "tab": "General information", + "score": 47.33086876155268 + }, + "IFEval - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.859, + "details": { + "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=37.094, mean=37.094, max=37.094, sum=37.094 (1)", + "tab": "Efficiency", + "score": 37.09404513451669 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + 
"score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.555, + "details": { + "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=95.485, mean=95.485, max=95.485, sum=95.485 (1)", + "tab": "Efficiency", + "score": 95.48487252116203 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", + "tab": "General information", + "score": 111.956 + }, + "Omni-MATH - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json similarity index 76% rename from data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json rename to data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json index 42be38419..869902b9d 100644 --- a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json +++ b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The 
mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.463, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json new file mode 100644 index 000000000..03bc0f0f8 --- /dev/null +++ b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "IBM Granite 4.0 Small", + "id": "ibm/granite-4.0-h-small", + "developer": "ibm", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.575, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 21.31162992088884 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.569, + "details": { + "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=12.071, mean=12.071, max=12.071, sum=12.071 (1)", + "tab": "Efficiency", + "score": 12.070928404092788 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", + "tab": "General information", + "score": 288.391 + }, + "MMLU-Pro - # output tokens": { + "description": "min=372.93, mean=372.93, max=372.93, sum=372.93 (1)", + 
"tab": "General information", + "score": 372.93 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.383, + "details": { + "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=17.606, mean=17.606, max=17.606, sum=17.606 (1)", + "tab": "Efficiency", + "score": 17.606201725690354 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", + "tab": "General information", + "score": 303.2645739910314 + }, + "GPQA - # output tokens": { + "description": "min=439.648, mean=439.648, max=439.648, sum=439.648 (1)", + "tab": "General information", + "score": 439.6479820627803 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.89, + "details": { + "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.366, mean=13.366, max=13.366, sum=13.366 (1)", + "tab": "Efficiency", + "score": 13.366226098453712 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", + "tab": "General information", + "score": 51.53419593345656 + }, + "IFEval - # output tokens": { + "description": "min=494.717, mean=494.717, max=494.717, sum=494.717 (1)", + "tab": "General information", + "score": 494.7171903881701 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.739, + "details": { + "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=30.807, mean=30.807, max=30.807, sum=30.807 (1)", + "tab": "Efficiency", + "score": 30.80672695994377 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=996.159, mean=996.159, max=996.159, sum=996.159 (1)", + "tab": "General information", + "score": 996.159 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.296, + "details": { + "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=32.708, mean=32.708, max=32.708, sum=32.708 (1)", + "tab": "Efficiency", + "score": 32.70806641626358 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", + "tab": "General information", + "score": 118.438 + }, + "Omni-MATH - # output tokens": { + "description": "min=1020.51, mean=1020.51, max=1020.51, sum=1020.51 (1)", + "tab": "General information", + "score": 1020.51 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json new file mode 100644 index 000000000..399dbb1e3 --- /dev/null +++ b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + 
"source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "IBM Granite 4.0 Micro", + "id": "ibm/granite-4.0-micro", + "developer": "ibm", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.486, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 5.725128505637726 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.395, + "details": { + "description": "min=0.395, mean=0.395, max=0.395, sum=0.395 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=3.135, mean=3.135, max=3.135, sum=3.135 (1)", + "tab": "Efficiency", + "score": 3.1348352246284485 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", + "tab": "General information", + "score": 288.391 + }, + "MMLU-Pro - # output tokens": { + "description": "min=325.255, mean=325.255, max=325.255, sum=325.255 (1)", + "tab": "General information", + "score": 325.255 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.307, + "details": { + "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=3.075, mean=3.075, max=3.075, sum=3.075 (1)", + "tab": "Efficiency", + "score": 3.075281912970436 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + 
"score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", + "tab": "General information", + "score": 303.2645739910314 + }, + "GPQA - # output tokens": { + "description": "min=337.417, mean=337.417, max=337.417, sum=337.417 (1)", + "tab": "General information", + "score": 337.4170403587444 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.849, + "details": { + "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=4.58, mean=4.58, max=4.58, sum=4.58 (1)", + "tab": "Efficiency", + "score": 4.580414981806785 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", + "tab": "General information", + "score": 51.53419593345656 + }, + "IFEval - # output tokens": { + "description": "min=497.8, mean=497.8, max=497.8, sum=497.8 (1)", + "tab": "General information", + "score": 497.8003696857671 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.67, + "details": { + "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=8.161, mean=8.161, max=8.161, sum=8.161 (1)", + "tab": "Efficiency", + "score": 8.160923891305924 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 
(1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1037.706, mean=1037.706, max=1037.706, sum=1037.706 (1)", + "tab": "General information", + "score": 1037.706 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.209, + "details": { + "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=9.674, mean=9.674, max=9.674, sum=9.674 (1)", + "tab": "Efficiency", + "score": 9.674186517477036 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", + "tab": "General information", + "score": 118.438 + }, + "Omni-MATH - # output tokens": { + "description": "min=1145.889, mean=1145.889, max=1145.889, sum=1145.889 (1)", + "tab": "General information", + "score": 1145.889 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json similarity index 75% rename from data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json rename to data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json index 3622da7c6..736686c13 100644 --- a/data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json +++ b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { 
"evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": 
"2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json similarity index 76% rename from data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json rename to data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json index 6e7a59864..4dd5465a5 100644 --- a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json +++ b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.618, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": 
{ - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json similarity index 76% rename from data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json rename to data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json index 9ba719da5..407242cbb 100644 
--- a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json +++ b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.574, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json similarity index 76% rename from data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json rename to data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json index 4657892fd..30524d64b 100644 --- a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json +++ b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + 
"dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.444, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + 
"subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json similarity index 75% rename from data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json rename to data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json index 9c2141acc..d9ca75120 100644 --- a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json +++ b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 4 Maverick (17Bx128E) Instruct FP8", + "name": "Llama 4 Maverick 17Bx128E Instruct FP8", "id": "meta/llama-4-maverick-17b-128e-instruct-fp8", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.718, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json similarity index 76% rename from data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json rename to 
data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json index 2d19156dc..640472423 100644 --- a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json +++ b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 4 Scout (17Bx16E) Instruct", + "name": "Llama 4 Scout 17Bx16E Instruct", "id": "meta/llama-4-scout-17b-16e-instruct", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.644, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + 
"evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json similarity index 76% rename from data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json rename to data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json index 6663598e4..0b19a4ab4 100644 --- a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json +++ b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", 
"inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.376, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json similarity index 76% rename from data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json rename to data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json index db1fa9b82..dec52ca8a 100644 --- a/data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json +++ b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2411)", + "name": "Mistral Large 2411", "id": "mistralai/mistral-large-2411", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.598, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json similarity index 76% rename from data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json rename to 
data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json index 69ce74931..7999b823d 100644 --- a/data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json +++ b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small 3.1 (2503)", + "name": "Mistral Small 3.1 2503", "id": "mistralai/mistral-small-2503", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.558, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": 
"IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json similarity index 76% rename from data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json rename to data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json index 2dfb94872..583f7956f 100644 --- a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json +++ b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral Instruct (8x22B)", + "name": "Mixtral Instruct 8x22B", "id": "mistralai/mixtral-8x22b-instruct-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 
+17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.478, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json similarity index 76% rename from data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json rename to data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json index 293d11168..d2c9cfb4e 100644 --- a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json +++ b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral Instruct (8x7B)", + "name": "Mixtral Instruct 8x7B", "id": "mistralai/mixtral-8x7b-instruct-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.397, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json similarity index 76% rename from data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json rename to 
data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json index 4c25e86d3..1946db617 100644 --- a/data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json +++ b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.768, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval 
strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json rename to data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json index c005600e1..3c36cb01b 100644 --- a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json +++ b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 (2025-04-14)", + "name": "GPT-4.1 2025-04-14", "id": "openai/gpt-4.1-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores 
from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json rename to data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json index d6481e60a..dd4503511 100644 --- a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json +++ b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 mini (2025-04-14)", + "name": "GPT-4.1 mini 2025-04-14", "id": "openai/gpt-4.1-mini-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.726, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - 
COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json rename to data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json index e878bf385..e2550958a 100644 --- a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json +++ b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json @@ -1,10 +1,7 @@ { - 
"schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 nano (2025-04-14)", + "name": "GPT-4.1 nano 2025-04-14", "id": "openai/gpt-4.1-nano-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.616, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + 
"evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json rename to data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json index ae08e8732..3c3d40256 100644 --- a/data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json +++ b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-11-20)", + "name": "GPT-4o 2024-11-20", "id": "openai/gpt-4o-2024-11-20", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.634, 
"details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": 
"Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json rename to data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json index c3aeb8ab5..778449e6e 100644 --- a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json +++ b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.565, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json similarity index 76% rename from data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json rename to data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json index 2fd77c3d1..95d9762ef 100644 --- a/data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json +++ b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 (2025-08-07)", + "name": "GPT-5 2025-08-07", "id": "openai/gpt-5-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.807, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json similarity index 76% rename from data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json rename to data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json index cf4a0414b..5dc165206 100644 --- a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json +++ b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 mini (2025-08-07)", + "name": "GPT-5 mini 2025-08-07", "id": "openai/gpt-5-mini-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.819, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json similarity index 76% rename from data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json rename to data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json index a9996e0cd..096518c62 100644 --- a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json +++ b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 nano (2025-08-07)", + "name": "GPT-5 nano 2025-08-07", "id": "openai/gpt-5-nano-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.748, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain 
of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json new file mode 100644 index 000000000..738007852 --- /dev/null +++ b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "GPT-5.1 2025-11-13", + "id": "openai/gpt-5.1-2025-11-13", + "developer": "openai", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.656, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 10.620566227529599 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.579, + "details": { + "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)", + "tab": "Efficiency", + "score": 1.1470122172832489 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", + "tab": "General information", + "score": 248.569 + }, + "MMLU-Pro - # output tokens": { + "description": "min=5.002, mean=5.002, max=5.002, sum=5.002 (1)", + "tab": "General information", + "score": 5.002 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.442, + "details": { + "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=1.002, mean=1.002, max=1.002, sum=1.002 (1)", + "tab": "Efficiency", + "score": 1.002433323539426 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", + "tab": "General information", + "score": 268.15246636771303 + }, + "GPQA - # output tokens": { + "description": 
"min=5.422, mean=5.422, max=5.422, sum=5.422 (1)", + "tab": "General information", + "score": 5.42152466367713 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.935, + "details": { + "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.159, mean=13.159, max=13.159, sum=13.159 (1)", + "tab": "Efficiency", + "score": 13.15882584436103 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", + "tab": "General information", + "score": 45.67097966728281 + }, + "IFEval - # output tokens": { + "description": "min=647.063, mean=647.063, max=647.063, sum=647.063 (1)", + "tab": "General information", + "score": 647.0628465804067 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.863, + "details": { + "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=28.081, mean=28.081, max=28.081, sum=28.081 (1)", + "tab": "Efficiency", + "score": 28.08133857488632 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=2059.716, mean=2059.716, max=2059.716, sum=2059.716 (1)", + "tab": "General information", + "score": 2059.716 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.464, + "details": { + "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=9.713, mean=9.713, max=9.713, sum=9.713 (1)", + "tab": "Efficiency", + "score": 9.713221177577973 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", + "tab": "General information", + "score": 109.623 + }, + "Omni-MATH - # output tokens": { + "description": "min=1256.266, mean=1256.266, max=1256.266, sum=1256.266 (1)", + "tab": "General information", + "score": 1256.266 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json similarity index 76% rename from data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json rename to data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json index 0b6f0418d..8642e9954 100644 --- a/data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json +++ b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.77, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git 
a/data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json similarity index 76% rename from data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json rename to data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json index 36043d89a..5112d535f 100644 --- a/data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json +++ b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.674, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + 
"dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json similarity index 75% rename from data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json rename to data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json index 2d017bb31..677721448 100644 --- a/data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json +++ b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "o3 (2025-04-16)", + "name": "o3 2025-04-16", "id": "openai/o3-2025-04-16", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.811, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + 
"evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json similarity index 76% rename from data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json rename to data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json index db654a7b8..fd4ae16c5 100644 --- a/data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json +++ b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "o4-mini (2025-04-16)", + "name": "o4-mini 2025-04-16", "id": "openai/o4-mini-2025-04-16", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.812, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + 
"use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json similarity index 76% rename from data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json rename to data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json index 7bc9ee7ae..50778c699 100644 --- a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json +++ 
b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.599, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - 
"evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json similarity index 76% rename from data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json rename to data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json index 921d79480..c974f1019 100644 --- a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json +++ b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.529, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + 
"evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json similarity index 76% rename from data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json rename to data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json index 7bc1c5881..9ded60c84 100644 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json +++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.726, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json similarity index 76% rename from data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json rename to data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json index 355119fa7..0210712c3 100644 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json +++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": 
"helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.798, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB 
Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json new file mode 100644 index 000000000..6ee69548e --- /dev/null +++ b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Qwen3-Next 80B A3B Thinking", + "id": "qwen/qwen3-next-80b-a3b-thinking", + "developer": "qwen", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 27.61164260375731 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.786, + "details": { + "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)", + 
"tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=20.097, mean=20.097, max=20.097, sum=20.097 (1)", + "tab": "Efficiency", + "score": 20.09722422862053 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", + "tab": "General information", + "score": 259.715 + }, + "MMLU-Pro - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.63, + "details": { + "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=40.06, mean=40.06, max=40.06, sum=40.06 (1)", + "tab": "Efficiency", + "score": 40.06039341950096 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", + "tab": "General information", + "score": 274.36995515695065 + }, + "GPQA - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.81, + "details": { + "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.893, mean=13.893, max=13.893, sum=13.893 (1)", + "tab": "Efficiency", + "score": 13.89268838323639 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": 
"min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", + "tab": "General information", + "score": 46.491682070240294 + }, + "IFEval - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.807, + "details": { + "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=23.095, mean=23.095, max=23.095, sum=23.095 (1)", + "tab": "Efficiency", + "score": 23.095464605808257 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.467, + "details": { + "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=40.912, mean=40.912, max=40.912, sum=40.912 (1)", + "tab": "Efficiency", + "score": 40.91244238162041 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", + "tab": "General information", + "score": 111.6 + }, + "Omni-MATH - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General 
information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json similarity index 76% rename from data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json rename to data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json index cf2b63d2e..b86fc5b45 100644 --- a/data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json +++ b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-fin/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.577, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + 
"use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json similarity index 75% rename from data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json rename to data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json index 0d8108574..ac68f722a 100644 --- a/data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json +++ b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-med/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-med/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.476, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + 
"evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json similarity index 76% rename from data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json rename to data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json index 8b3240898..9398b6319 100644 --- a/data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json +++ b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json similarity index 75% rename from data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json rename to data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json index ba834a256..6d3707107 100644 --- a/data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json +++ b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x5/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.696, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json similarity index 76% rename from data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json rename to data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json index 7640dfe10..54503d043 100644 --- a/data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json +++ b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-3-beta/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json similarity index 76% rename from data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json rename to 
data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json index 8570e4d80..a083c0183 100644 --- a/data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json +++ b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of 
instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json similarity index 75% rename from data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json rename to data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json index b9fbeb3c0..a25562cb1 100644 --- a/data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json +++ b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-4-0709/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Grok 4 (0709)", + "name": "Grok 4 0709", "id": "xai/grok-4-0709", "developer": "xai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.785, "details": { - "description": null, 
"tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of 
the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json similarity index 76% rename from data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json rename to data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json index b4d4807f0..43a98dd63 100644 --- a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json +++ b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.67, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, 
"generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json similarity index 89% rename from data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json rename to data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json index fac51642a..152223193 100644 --- a/data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json +++ b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/anthropic_Anthropic-LM-v4-s3-52B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": 
"documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Anthropic-LM v4-s3 52B", - "id": "anthropic/Anthropic-LM-v4-s3-52B", - "developer": "anthropic", + "id": "Anthropic-LM-v4-s3-52B", + "developer": "unknown", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.78, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model 
truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to 
light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json similarity index 89% rename from data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json rename to data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json index 05d951313..6a9a41b41 100644 --- a/data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json +++ b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", 
"lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.433, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json similarity index 89% rename from data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json rename to data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json index cc58c06c0..30c92ab94 100644 --- a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json +++ b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.706, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json similarity index 89% rename from data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json rename to data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json index 0be03d012..df8111bcc 100644 --- a/data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json +++ b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.517, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json similarity index 89% rename from data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json rename to data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json index 3239df52d..5c8560533 100644 --- a/data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json +++ b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.285, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json similarity index 89% rename from data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json rename to data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json index 71ff2dc38..4f288f894 100644 --- a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json +++ b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.743, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json similarity index 89% rename from data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json rename to data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json index ab1f54c90..6d0308b9f 100644 --- a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json +++ b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json similarity index 89% rename from data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json rename to data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json index 14e3a243d..4278cef81 100644 --- a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json +++ b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.553, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json similarity index 89% rename from data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json rename to data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json index 9fccefc67..7e02805f7 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json +++ b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.315, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - 
"evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json similarity index 89% rename from data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json rename to data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json index 9f9536338..d6f8fa8ea 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json +++ b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.485, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json similarity index 89% rename from data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json rename to data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json index ed0fa9dcd..5680298fb 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json +++ b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.662, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json similarity index 89% rename from data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json rename to data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json index 19831593f..caffd542e 100644 --- a/data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json +++ b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.446, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json similarity index 90% rename from data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json rename to data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json index af37640ca..400f064d5 100644 --- a/data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json +++ b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/bigscience_T0pp-11B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.197, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json similarity index 89% rename from data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json rename to data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json index 5eb323191..25f29c7e2 100644 --- a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json +++ b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.874, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json similarity index 89% rename from data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json rename to data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json index d20d6332d..8f01acff1 100644 --- a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json +++ b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.675, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} 
+ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json similarity index 89% rename from data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json rename to data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json index 54182b504..16c06b937 100644 --- a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json +++ b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.372, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json similarity index 89% rename from data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json rename to data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json index ecba92b3a..f0d42b850 100644 --- a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json +++ b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.23, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json similarity index 89% rename from data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json rename to data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json index 0b33b0763..43f986e70 100644 --- a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json +++ b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.312, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json similarity index 89% rename from data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json rename to data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json index 4abc0c79b..adaaa9403 100644 --- a/data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json +++ b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.109, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json similarity index 89% rename from data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json rename to data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json index 6c362be4c..80b637746 100644 --- a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json +++ b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.56, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json similarity index 89% rename from data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json rename to data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json index f92b78094..cc49de0c7 100644 --- a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json +++ b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.664, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json similarity index 88% rename from data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json rename to data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json index f6f9d6eae..bc304945b 100644 --- a/data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json +++ b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_Pythia-12B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Pythia 12B", - "id": "eleuther-ai/Pythia-12B", - "developer": "eleuther-ai", + "id": "eleutherai/Pythia-12B", + "developer": "eleutherai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.257, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { 
"description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json similarity index 88% rename from data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json rename to data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json index 2b488fa6f..511816a71 100644 --- a/data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json +++ b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_Pythia-6.9B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Pythia 6.9B", - "id": "eleuther-ai/Pythia-6.9B", - "developer": "eleuther-ai", + "id": "eleutherai/Pythia-6.9B", + "developer": "eleutherai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - 
Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json similarity index 88% rename from data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json rename to data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json index 725954e16..8d33e45b6 100644 --- a/data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json +++ b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/writer_Palmyra-X-43B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Palmyra X 43B", - "id": "writer/Palmyra-X-43B", - "developer": "writer", + "id": "google/Palmyra-X-43B", + "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.732, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, 
@@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json similarity index 89% rename from data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json rename to data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json index 9bacd9bf9..2a710defd 100644 --- a/data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json +++ b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/google_T5-11B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_T5-11B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.131, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json similarity index 89% rename from data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json rename to data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json index c9bf42a12..bb571aece 100644 --- a/data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json +++ b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/google_UL2-20B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_UL2-20B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.167, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json similarity index 88% rename from data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json rename to data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json index 65a179431..e1d9662a3 100644 --- a/data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json +++ b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.706, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json similarity index 88% rename from data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json rename to data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json index bf5b7f8ab..b03d7afe6 100644 --- a/data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json +++ b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.625, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": 
{ + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json similarity index 88% rename from data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json rename to data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json index b007605c7..959b52195 100644 --- a/data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json +++ b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.595, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json similarity index 88% rename from data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json rename to data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json index 8e6647f52..7f604e015 100644 --- a/data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json +++ b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.781, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json similarity index 88% rename from data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json rename to data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json index 1dbaa6d85..ad8c1c451 100644 --- a/data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json +++ b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-65B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-65B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.908, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json similarity index 88% rename from data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json rename to data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json index 4a772fb18..152b9e683 100644 --- a/data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json +++ b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.533, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json similarity index 88% rename from data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json rename to data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json index de40c742e..f2cd54e60 100644 --- a/data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json +++ b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.823, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json similarity index 88% rename from data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json rename to data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json index 77f6938f9..de031e670 100644 --- a/data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json +++ b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-70B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-70B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.944, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json similarity index 88% rename from data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json rename to data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json index 3b3b39208..eac315fea 100644 --- a/data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json +++ b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.607, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json similarity index 89% rename from data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json rename to data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json index 0da99434e..63a0c348d 100644 --- a/data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json +++ b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_OPT-175B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_OPT-175B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json similarity index 89% rename from data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json rename to data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json index 929a020d2..2f3d2ad96 100644 --- a/data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json +++ b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_OPT-66B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_OPT-66B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.448, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json similarity index 89% rename from data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json rename to data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json index 786e640a5..ddcfa82ef 100644 --- a/data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json +++ b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.787, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", 
+ "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json similarity index 89% rename from data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json rename to data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json index ade6f8a0a..b3f527a04 100644 --- a/data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json +++ b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.309, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", 
+ "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json similarity index 88% rename from data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json rename to data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json index a4f716c06..1fd56a99f 100644 --- a/data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json +++ b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mistral-ai_Mistral-v0.1-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Mistral v0.1 7B", - "id": "mistral-ai/Mistral-v0.1-7B", - "developer": "mistral-ai", + "id": "mistralai/Mistral-v0.1-7B", + "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.884, "details": { - "description": null, "tab": 
"Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json similarity index 88% rename from data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json rename to data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json index bf414b629..b0d1817b0 100644 --- a/data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json +++ b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mosaicml_MPT-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.714, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json similarity index 88% rename from data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json rename to data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json index dd4c71e77..771c4ac02 100644 --- a/data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json +++ b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.716, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + 
"evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json similarity index 89% rename from data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json rename to data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json index 64c16a070..20a0f0d63 100644 --- a/data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json +++ b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_GPT-J-6B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_GPT-J-6B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "GPT-J 6B", - "id": "eleuther-ai/GPT-J-6B", - "developer": "eleuther-ai", + "id": "openai/GPT-J-6B", + "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.273, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } 
}, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json similarity index 89% rename from data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json rename to data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json index b26d9ed28..0c00ea05c 100644 --- a/data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json +++ b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_GPT-NeoX-20B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "GPT-NeoX 20B", - "id": "eleuther-ai/GPT-NeoX-20B", - "developer": "eleuther-ai", + "id": "openai/GPT-NeoX-20B", + "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.351, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { 
"description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json similarity index 93% rename from data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json rename to data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json index ee84122f5..5355ce78b 100644 --- a/data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json +++ b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_ada-350M/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_ada-350M/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.108, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json similarity index 93% rename from data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json rename to data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json index 3a55a8db1..d3977fc36 100644 --- a/data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json +++ b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_babbage-1.3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_babbage-1.3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.114, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json similarity index 93% rename from data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json rename to data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json index d7959e7bb..fe011ca06 100644 --- a/data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json +++ b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_curie-6.7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_curie-6.7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.247, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": 
"MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json similarity index 93% rename from data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json rename to data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json index 6b30fefef..b376d2873 100644 --- a/data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json +++ b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_davinci-175B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_davinci-175B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.538, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json similarity index 88% rename from data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json rename to data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json index 435cb040d..8051b9b3e 100644 --- a/data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json +++ b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.76, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + 
"evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json similarity index 88% rename from data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json rename to data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json index bf7553bf6..b2682e6f7 100644 --- a/data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json +++ b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.783, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + 
"evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json similarity index 93% rename from data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json rename to data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json index d1a92ef67..43f728bf2 100644 --- a/data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json +++ b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-ada-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-ada-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.107, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json similarity index 93% rename from data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json rename to data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json index fb51f6a42..fbb4b5bb6 100644 --- a/data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json +++ b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-babbage-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-babbage-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.229, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": 
"MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json similarity index 93% rename from data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json rename to data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json index bb4d6e7ff..4537bcc84 100644 --- a/data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json +++ b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-curie-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-curie-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.36, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json similarity index 93% rename from data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json rename to data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json index 4d9b820e6..0e9fa4947 100644 --- a/data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json +++ b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-davinci-002/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-davinci-002/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.905, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": 
"MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json similarity index 92% rename from data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json rename to data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json index 437247369..9ca831c0f 100644 --- a/data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json +++ b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-davinci-003/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-davinci-003/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.872, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": 
"MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json similarity index 88% rename from data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json rename to data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json index 24ce27c0b..cf2a4b297 100644 --- a/data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json +++ b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/stanford_Alpaca-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.381, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json similarity index 88% rename from data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json rename to data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json index a08e1b6ca..97f13c6d9 100644 --- a/data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json +++ b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-40B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon 40B", - "id": "tii-uae/Falcon-40B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-40B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.729, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, 
- "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json similarity index 88% rename from data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json rename to data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json index 0911bfafa..80c0ac18a 100644 --- a/data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json +++ b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon 7B", - "id": "tii-uae/Falcon-7B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-7B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.378, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json similarity index 88% rename from data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json rename to data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json index 99345e7ef..4b7c6b681 100644 --- a/data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json +++ b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-Instruct-40B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon-Instruct 40B", - "id": "tii-uae/Falcon-Instruct-40B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-Instruct-40B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, 
"tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json similarity index 88% rename from data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json rename to data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json index b0b75c2b1..cd7efa818 100644 --- a/data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json +++ b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-Instruct-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon-Instruct 7B", - "id": "tii-uae/Falcon-Instruct-7B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-Instruct-7B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.244, "details": { - "description": null, "tab": 
"Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json similarity index 88% rename from data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json rename to data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json index 66ae49567..f25c83f2e 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json +++ b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.378, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json similarity index 88% rename from data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json rename to data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json index f09058f3c..d4d85552c 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json +++ b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.311, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json similarity index 88% rename from data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json rename to data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json index 9ed3b7bf9..9d60f7506 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json +++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json similarity index 88% rename from data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json rename to data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json index bb56f1198..57ffafd39 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json +++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.366, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json similarity index 89% rename from data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json rename to data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json index add4859be..fe1ab40e2 100644 --- a/data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json +++ b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.568, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json similarity index 89% rename from data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json rename to data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json index 74662144a..61a019ad2 100644 --- a/data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json +++ b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/yandex_YaLM-100B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/yandex_YaLM-100B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.075, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json similarity index 89% rename from data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json rename to data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json index 2f17c575d..04bdfa490 100644 --- a/data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json +++ b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.512, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": 
"MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json b/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json deleted file mode 100644 index 841d52f14..000000000 --- a/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic Claude v1.3", - "id": "anthropic/claude-v1.3", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": null, - "tab": "Instruction Following" - } - }, - "generation_config": {} - }, - { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). 
We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.965, - "details": { - "description": "min=4.925, mean=4.965, max=5, sum=39.72 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", - "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.985, mean=4.995, max=5, sum=19.98 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Koala test dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.981, - "details": { - "description": "min=4.965, mean=4.981, max=5, sum=19.925 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Open Assistant - Harmlessness", - "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). 
We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.975, - "details": { - "description": "min=4.935, mean=4.975, max=5, sum=19.9 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Self Instruct - Harmlessness", - "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.992, - "details": { - "description": "min=4.98, mean=4.992, max=5, sum=19.97 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Vicuna - Harmlessness", - "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.989, - "details": { - "description": "min=4.956, mean=4.989, max=5, sum=19.956 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json new file mode 100644 index 000000000..31ab229b7 --- /dev/null +++ b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json @@ -0,0 +1,267 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Anthropic Claude v1.3", + "id": "anthropic/claude-v1.3", + "developer": "anthropic", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.611, + "details": { + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "Anthropic RLHF 
dataset", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.965, + "details": { + "description": "min=4.925, mean=4.965, max=5, sum=39.72 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.995, + "details": { + "description": "min=4.985, mean=4.995, max=5, sum=19.98 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.981, + "details": { + "description": "min=4.965, mean=4.981, max=5, sum=19.925 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.975, + "details": { + "description": "min=4.935, mean=4.975, max=5, sum=19.9 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": 
"Harmlessness on Self Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.992, + "details": { + "description": "min=4.98, mean=4.992, max=5, sum=19.97 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.989, + "details": { + "description": "min=4.956, mean=4.989, max=5, sum=19.956 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json new file mode 100644 index 000000000..2fd221159 --- /dev/null +++ b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json @@ -0,0 +1,267 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Cohere Command beta 52.4B", + "id": "cohere/command-xlarge-beta", + "developer": "cohere", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.089, + "details": { + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "Anthropic RLHF dataset", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.214, + "details": { + "description": "min=3.38, mean=4.214, max=4.92, sum=33.715 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + 
"claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.988, + "details": { + "description": "min=4.98, mean=4.988, max=5, sum=19.95 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.969, + "details": { + "description": "min=4.936, mean=4.969, max=5, sum=19.874 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.967, + "details": { + "description": "min=4.955, mean=4.967, max=5, sum=19.87 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Self Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.971, + "details": { + "description": "min=4.955, mean=4.971, max=5, sum=19.885 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": 
false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.995, + "details": { + "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json b/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json deleted file mode 100644 index 0905e2f21..000000000 --- a/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere Command beta (52.4B)", - "id": "cohere/command-xlarge-beta", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.089, - "details": { - "description": null, - "tab": "Instruction Following" - } - }, - "generation_config": {} - }, - { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). 
We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.214, - "details": { - "description": "min=3.38, mean=4.214, max=4.92, sum=33.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", - "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.988, - "details": { - "description": "min=4.98, mean=4.988, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Koala test dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.969, - "details": { - "description": "min=4.936, mean=4.969, max=5, sum=19.874 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Open Assistant - Harmlessness", - "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). 
We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.967, - "details": { - "description": "min=4.955, mean=4.967, max=5, sum=19.87 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Self Instruct - Harmlessness", - "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.971, - "details": { - "description": "min=4.955, mean=4.971, max=5, sum=19.885 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Vicuna - Harmlessness", - "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json new file mode 100644 index 000000000..23dfc4397 --- /dev/null +++ b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json @@ -0,0 +1,267 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "GPT-3.5 Turbo 0613", + "id": "openai/gpt-3.5-turbo-0613", + "developer": "openai", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.689, + "details": { + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": 
"Anthropic RLHF dataset", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.964, + "details": { + "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.986, + "details": { + "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.987, + "details": { + "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.987, + "details": { + "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + 
"evaluation_description": "Harmlessness on Self Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.99, + "details": { + "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.992, + "details": { + "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json deleted file mode 100644 index 4dc9e1ef5..000000000 --- a/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo (0613)", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": null, - "tab": "Instruction Following" - } - }, - "generation_config": {} - }, - { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). 
We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.964, - "details": { - "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", - "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Koala test dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Open Assistant - Harmlessness", - "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). 
We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Self Instruct - Harmlessness", - "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.99, - "details": { - "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Vicuna - Harmlessness", - "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.992, - "details": { - "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json new file mode 100644 index 000000000..9ad1bca2e --- /dev/null +++ b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json @@ -0,0 +1,267 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "GPT-4 0314", + "id": "openai/gpt-4-0314", + "developer": "openai", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.611, + "details": { + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "Anthropic RLHF dataset", + "source_data": { + 
"dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.934, + "details": { + "description": "min=4.83, mean=4.934, max=5, sum=39.47 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.973, + "details": { + "description": "min=4.915, mean=4.973, max=5, sum=19.894 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.966, + "details": { + "description": "min=4.913, mean=4.966, max=5, sum=19.863 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.986, + "details": { + "description": "min=4.97, mean=4.986, max=5, sum=19.945 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Self 
Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.976, + "details": { + "description": "min=4.945, mean=4.976, max=5, sum=19.905 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.995, + "details": { + "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json b/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json deleted file mode 100644 index f76268b07..000000000 --- a/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/openai_gpt-4-0314/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 (0314)", - "id": "openai/gpt-4-0314", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": null, - "tab": "Instruction Following" - } - }, - "generation_config": {} - }, - { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). 
We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.934, - "details": { - "description": "min=4.83, mean=4.934, max=5, sum=39.47 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", - "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.973, - "details": { - "description": "min=4.915, mean=4.973, max=5, sum=19.894 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Koala test dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.966, - "details": { - "description": "min=4.913, mean=4.966, max=5, sum=19.863 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Open Assistant - Harmlessness", - "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). 
We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.97, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Self Instruct - Harmlessness", - "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.976, - "details": { - "description": "min=4.945, mean=4.976, max=5, sum=19.905 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Vicuna - Harmlessness", - "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json similarity index 76% rename from data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json rename to data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json index 8d9b0c6e7..946b7db3e 100644 --- a/data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json +++ b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-34b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-34b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (34B)", + "name": "Yi 34B", "id": "01-ai/yi-34b", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { 
"evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.57, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 
57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference 
up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json similarity index 76% rename from data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json rename to data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json index 04e690e09..28ba5fb69 100644 --- a/data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json +++ b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-6b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-6b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (6B)", + "name": "Yi 6B", "id": "01-ai/yi-6b", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ 
"evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.253, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json similarity index 76% rename from data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json rename to data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json index 6d66d647a..9fe678bb4 100644 --- a/data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json +++ b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-large-preview/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi Large (Preview)", + "name": "Yi Large Preview", "id": "01-ai/yi-large-preview", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.471, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA 
- EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json similarity index 76% rename from data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json rename to data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json index 4d89d0b52..fb405652b 100644 --- a/data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json +++ b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Base (13B)", + "name": "Luminous Base 13B", "id": "AlephAlpha/luminous-base", "developer": "AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, 
@@ -30,7 +34,6 @@ "score_details": { "score": 0.041, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that 
the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json similarity index 76% rename from data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json rename to data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json index 74581377a..786a7e340 100644 --- a/data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json +++ b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Extended (30B)", + "name": "Luminous Extended 30B", "id": "AlephAlpha/luminous-extended", "developer": 
"AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.078, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json similarity index 76% rename from data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json rename to data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json index 9f7e37eaf..78da47969 100644 --- a/data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json +++ b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Supreme (70B)", + "name": "Luminous Supreme 70B", "id": "AlephAlpha/luminous-supreme", "developer": "AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.145, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + 
"dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json similarity index 76% rename from data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json rename to data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json index 9efa2b824..2b870e958 100644 --- a/data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json +++ b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_j2-grande/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_j2-grande/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Jurassic-2 Grande (17B)", + "name": "Jurassic-2 Grande 17B", "id": "ai21/j2-grande", "developer": "ai21", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": 
false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.172, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json similarity index 76% rename from data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json rename to data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json index 1c64f2731..643b24001 100644 --- a/data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json +++ b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_j2-jumbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_j2-jumbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Jurassic-2 Jumbo (178B)", + "name": "Jurassic-2 Jumbo 178B", "id": "ai21/j2-jumbo", "developer": "ai21", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + 
"source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json similarity index 76% rename from data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json rename to data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json index 634cd87ae..a07da123a 100644 --- a/data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json +++ b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.637, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives 
[(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": 
{ + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, 
"metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json similarity index 76% rename from data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json rename to data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json index 3483b0b9a..9e0628c9d 100644 --- a/data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json +++ b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.414, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA 
- EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json similarity index 76% rename from data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json rename to data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json index 527fb50a5..9e1241a8e 100644 --- a/data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json +++ b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.287, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, 
{ - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - 
"college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ 
+ "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,12 +628,14 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json similarity index 76% rename from data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json rename to data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json index 51634a355..b68794dd1 100644 --- a/data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json +++ b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/allenai_olmo-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/allenai_olmo-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo (7B)", + "name": "OLMo 7B", "id": "allenai/olmo-7b", "developer": "allenai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average 
(over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.052, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + 
"evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json similarity index 76% rename from data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json rename to data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json index e8381a3f3..084734ba7 100644 --- a/data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json +++ b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.708, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json similarity index 76% rename from data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json rename to data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json index 8fb5d6b37..fb66c7744 100644 --- a/data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json +++ b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension 
over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", 
+ "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json similarity index 76% rename from data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json rename to data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json index 52c65584f..c7f9d86e2 100644 --- a/data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json +++ b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.885, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + 
"dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json similarity index 76% rename from data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json rename to data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json index b883ce7c5..ab0989b58 100644 --- a/data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json +++ b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-2.0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-2.0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.489, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": 
null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - 
"function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json similarity index 76% rename from data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json rename to data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json index 388a1840c..2adbb62af 100644 --- a/data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json +++ b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-2.1/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-2.1/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over 
columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.437, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + 
"evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json rename to data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json index 231b91f4e..ff757a7ad 100644 --- a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json +++ b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, 
"model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.531, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json rename to data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json index 0ee2e76e5..2c4b0d7d1 100644 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json +++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20240620)", + "name": "Claude 3.5 Sonnet 20240620", "id": "anthropic/claude-3-5-sonnet-20240620", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.885, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - 
"evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - 
"college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + 
"abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json rename to data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json index d816a8a2a..4b9824f13 100644 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json +++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.846, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json rename to data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json index 66e3c14b8..8eac62865 100644 --- a/data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json +++ b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Haiku (20240307)", + "name": "Claude 3 Haiku 20240307", "id": "anthropic/claude-3-haiku-20240307", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.263, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + 
"evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json rename to data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json index 27c9ec758..d590c786e 100644 --- a/data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json +++ b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Opus (20240229)", + "name": "Claude 3 Opus 20240229", "id": "anthropic/claude-3-opus-20240229", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.683, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json rename to data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json index 3cbea3718..90baddbf7 100644 --- a/data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json +++ b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Sonnet (20240229)", + "name": "Claude 3 Sonnet 20240229", "id": "anthropic/claude-3-sonnet-20240229", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.377, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - 
F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json similarity index 76% rename from data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json rename to data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json index a1592f60e..c3ca60cb8 100644 --- a/data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json +++ b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.399, "details": { - 
"description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, 
"generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json similarity index 76% rename from data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json rename to data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json index e73713e6a..da3e6b3b3 100644 --- a/data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json +++ b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-v1.3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { 
"evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.518, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 
57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference 
up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json similarity index 76% rename from data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json rename to data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json index aabe52512..a431f3338 100644 --- a/data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json +++ b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-light/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-light/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.105, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json similarity index 76% rename from data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json rename to data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json index 288bdd798..d0f464767 100644 --- a/data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json +++ b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-r-plus/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-r-plus/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.441, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over 
narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + 
"source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" 
+ ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json similarity index 76% rename from data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json rename to data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json index 33b212443..51821d155 100644 --- a/data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json +++ b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-r/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-r/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.299, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA 
- EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json similarity index 76% rename from data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json rename to data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json index b95f59ea4..488fa54b9 100644 --- a/data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json +++ b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.327, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - 
F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json similarity index 76% rename from data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json rename to data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json index 7cf9a9388..9dc0aa32d 100644 --- a/data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json +++ b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/databricks_dbrx-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.289, "details": { - "description": null, "tab": "Accuracy", 
"Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - 
"abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json similarity index 76% rename from data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json rename to data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json index bf2730468..201ddf6e5 100644 --- a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json +++ b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "DeepSeek LLM Chat (67B)", + "name": "DeepSeek LLM Chat 67B", "id": "deepseek-ai/deepseek-llm-67b-chat", "developer": "deepseek-ai", "inference_platform": "unknown" @@ -20,6 
+17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.488, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json similarity index 76% rename from data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json rename to data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json index e07480be1..b5f8e240f 100644 --- a/data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json +++ b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.908, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading 
comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + 
"evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json similarity index 76% rename from data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json rename to data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json index eefe2f954..eabdc0bbd 100644 --- a/data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json +++ b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.0 Pro (002)", + "name": "Gemini 1.0 Pro 002", "id": "google/gemini-1.0-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.422, "details": { - "description": null, "tab": "Accuracy", "Mean 
win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - 
"abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json similarity index 76% rename from data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json rename to data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json index e10645540..991b81669 100644 --- a/data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json +++ b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (001)", + "name": "Gemini 1.5 Flash 001", "id": "google/gemini-1.5-flash-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean 
win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.667, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json similarity index 76% rename from data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json rename to data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json index 8e4eb067b..725c639a2 100644 --- a/data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json +++ b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.573, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": 
"NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + 
"additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + 
} } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json similarity index 76% rename from data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json rename to data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json index 38c3a236a..8b7eab026 100644 --- a/data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json +++ b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (001)", + "name": "Gemini 1.5 Pro 001", "id": "google/gemini-1.5-pro-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average 
(over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.739, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", 
+ "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json similarity index 76% rename from data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json rename to data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json index cada735aa..ebd3081fb 100644 --- a/data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json +++ b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 
002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.842, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json similarity index 76% rename from data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json rename to data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json index 1487ce304..b96b71c0c 100644 --- a/data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json +++ b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash (Experimental)", + "name": "Gemini 2.0 Flash Experimental", "id": "google/gemini-2.0-flash-exp", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.813, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { 
+ "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json similarity index 76% rename from data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json rename to data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json index 29456a114..ea107cc9e 100644 --- a/data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json +++ b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-2-27b-it/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 Instruct (27B)", + "name": "Gemma 2 Instruct 27B", "id": "google/gemma-2-27b-it", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many 
models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.675, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + 
"evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json similarity index 76% rename from data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json rename to data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json index 75457f70d..1488d6604 100644 --- a/data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json +++ b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-2-9b-it/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 Instruct (9B)", + "name": "Gemma 2 Instruct 9B", "id": "google/gemma-2-9b-it", 
"developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.562, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json similarity index 76% rename from data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json rename to data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json index dabc86d10..810e32965 100644 --- a/data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json +++ b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma (7B)", + "name": "Gemma 7B", "id": "google/gemma-7b", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.336, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json similarity index 76% rename from data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json rename to data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json index 9c9727ed0..30d0e3442 100644 --- a/data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json +++ b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_text-bison@001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_text-bison@001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Bison)", + "name": "PaLM-2 Bison", "id": "google/text-bison@001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { 
"score": 0.526, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json similarity index 76% rename from data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json rename to data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json index 2e152e4a7..d5841340f 100644 --- a/data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json +++ b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_text-unicorn@001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_text-unicorn@001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Unicorn)", + "name": "PaLM-2 Unicorn", "id": "google/text-unicorn@001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { 
"evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.644, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json similarity index 76% rename from data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json rename to data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json index a5b394c06..079c14180 100644 --- a/data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json +++ b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-13b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-13b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (13B)", + "name": "Llama 2 13B", "id": "meta/llama-2-13b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.233, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json similarity index 76% rename from data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json rename to data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json index cf4407980..8faa07285 100644 --- a/data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json +++ b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-70b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-70b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (70B)", + "name": "Llama 2 70B", "id": "meta/llama-2-70b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.482, "details": { - "description": null, 
"tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": 
[ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json similarity index 76% rename from data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json rename to data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json index 3b18db79e..bb2c02730 100644 --- a/data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json +++ b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (7B)", + "name": "Llama 2 7B", "id": "meta/llama-2-7b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.152, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json similarity index 76% rename from data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json rename to data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json index 90d04801d..876850010 100644 --- a/data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json +++ b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3-70b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3-70b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (70B)", + "name": "Llama 3 70B", "id": "meta/llama-3-70b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.793, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json similarity index 76% rename from data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json rename to data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json index 0e3ff704d..87ab72524 100644 --- a/data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json +++ b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3-8b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3-8b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (8B)", + "name": "Llama 3 8B", "id": "meta/llama-3-8b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.387, "details": { - "description": null, "tab": 
"Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - 
"abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json similarity index 76% rename from data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json rename to data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json index 8311edd73..0bc6225d5 100644 --- a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json +++ b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": 
"unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.854, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json similarity index 76% rename from data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json rename to data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json index 3e59bea75..d57074cb2 100644 --- a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json +++ b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.808, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + 
"evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json similarity index 76% rename from data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json rename to data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json index 300f5dbb2..198d81cd2 100644 --- a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json +++ b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.303, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json similarity index 76% rename from data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json rename to data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json index 4daa7f500..722a6f050 100644 --- a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json +++ b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (11B)", + "name": "Llama 3.2 Vision Instruct Turbo 11B", "id": "meta/llama-3.2-11b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - 
"abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + 
"additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json similarity index 76% rename from data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json rename to data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json index 17f50b1c8..8bef7c4e9 100644 --- a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json +++ b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (90B)", + "name": "Llama 3.2 Vision Instruct Turbo 90B", "id": "meta/llama-3.2-90b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { 
"evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.819, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json similarity index 76% rename from data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json rename to data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json index 06851628a..cc4cca983 100644 --- a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json +++ b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.3 Instruct Turbo (70B)", + "name": "Llama 3.3 Instruct Turbo 70B", "id": "meta/llama-3.3-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.812, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + 
"evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json similarity index 76% rename from data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json rename to data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json index 624d96ab6..ebea32b6c 100644 --- a/data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json +++ b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-65b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-65b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "LLaMA (65B)", + "name": "LLaMA 65B", "id": "meta/llama-65b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", 
"lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.345, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json similarity index 76% rename from data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json rename to data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json index 42e0ca1f2..ee330c2d2 100644 --- a/data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json +++ b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" 
+ ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.169, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json similarity index 76% rename from data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json rename to data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json index 40407df59..6d945026f 100644 --- a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json +++ b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (14B)", + "name": "Phi-3 14B", "id": "microsoft/phi-3-medium-4k-instruct", "developer": "microsoft", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.509, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": 
"NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - 
"us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json similarity index 76% rename from data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json rename to data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json index 4a88d2532..c7b88764b 100644 --- a/data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json +++ b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (7B)", + "name": "Phi-3 7B", "id": "microsoft/phi-3-small-8k-instruct", "developer": "microsoft", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.473, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json similarity index 76% rename from data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json rename to data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json index 81cb62772..fd0f8e02b 100644 --- a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json +++ b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - 
F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json similarity index 76% rename from data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json rename to data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json index 17ebd8348..8f4801f23 100644 --- a/data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json +++ b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral v0.1 (7B)", + "name": "Mistral v0.1 7B", "id": "mistralai/mistral-7b-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, 
"metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.292, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json similarity index 76% rename from data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json rename to data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json index ca506f27c..d8d60cc37 100644 --- a/data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json +++ b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2402)", + "name": "Mistral Large 2402", "id": "mistralai/mistral-large-2402", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.328, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + 
"dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json similarity index 76% rename from data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json rename to data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json index a10172374..d75c9932b 100644 --- a/data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json +++ b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large 2 (2407)", + "name": "Mistral Large 2 2407", "id": "mistralai/mistral-large-2407", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, 
"metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.744, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json similarity index 76% rename from data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json rename to data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json index 966d4c393..6bb7115e2 100644 --- a/data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json +++ b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Medium (2312)", + "name": "Mistral Medium 2312", "id": "mistralai/mistral-medium-2312", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.268, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": 
{ + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json similarity index 76% rename from data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json rename to data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json index 039a9d5cc..1f2cb2632 100644 --- a/data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json +++ b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small (2402)", + "name": "Mistral Small 2402", "id": "mistralai/mistral-small-2402", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, 
"metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.288, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json similarity index 76% rename from data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json rename to data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json index 781bbb2c8..e6bfd0332 100644 --- a/data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json +++ b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x22B)", + "name": "Mixtral 8x22B", "id": "mistralai/mixtral-8x22b", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.705, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json similarity index 76% rename from data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json rename to data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json index 818a4bd2a..7bf0323b1 100644 --- a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json +++ b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x7B 32K seqlen)", + "name": "Mixtral 8x7B 32K seqlen", "id": "mistralai/mixtral-8x7b-32kseqlen", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many 
models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.51, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + 
"evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json similarity index 76% rename from data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json rename to data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json index dfc851db9..7fee5cb57 100644 --- a/data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json +++ b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - 
"name": "Mistral NeMo (2402)", + "name": "Mistral NeMo 2402", "id": "mistralai/open-mistral-nemo-2407", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.333, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json similarity index 76% rename from data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json rename to data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json index 28acf453d..878d33981 100644 --- a/data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json +++ b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0613)", + "name": "GPT-3.5 Turbo 0613", "id": "openai/gpt-3.5-turbo-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.358, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + 
"additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + 
} } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json similarity index 76% rename from data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json rename to data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json index 6fa2534b1..7ff111f74 100644 --- a/data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json +++ b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-0613/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-0613/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 (0613)", + "name": "GPT-4 0613", "id": "openai/gpt-4-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { 
"score": 0.867, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json similarity index 76% rename from data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json rename to data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json index c0d921b54..060ab8fb5 100644 --- a/data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json +++ b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (1106 preview)", + "name": "GPT-4 Turbo 1106 preview", "id": "openai/gpt-4-1106-preview", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 
@@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.698, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json similarity index 76% rename from data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json rename to data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json index 599344447..dae83b652 100644 --- a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json +++ b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (2024-04-09)", + "name": "GPT-4 Turbo 2024-04-09", "id": "openai/gpt-4-turbo-2024-04-09", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.864, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + 
"source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - 
"method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json similarity index 76% rename from data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json rename to data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json index 98feb8bc0..c23053f17 100644 --- a/data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json +++ b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-05-13)", + "name": "GPT-4o 2024-05-13", "id": "openai/gpt-4o-2024-05-13", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How 
many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.938, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", 
+ "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json similarity index 76% rename from data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json rename to data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json index cb595e51b..f8d7c3614 100644 --- a/data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json +++ b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-08-06)", + "name": "GPT-4o 2024-08-06", "id": 
"openai/gpt-4o-2024-08-06", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.928, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json similarity index 76% rename from data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json rename to data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json index 3fb056373..3869cb246 100644 --- a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json +++ b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.701, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + 
"source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - 
"method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json similarity index 76% rename from data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json rename to data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json index d390f5b2a..f3294dd85 100644 --- a/data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json +++ b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_text-davinci-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_text-davinci-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 (text-davinci-002)", + "name": "GPT-3.5 text-davinci-002", "id": "openai/text-davinci-002", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": 
"How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.336, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et 
al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", 
+ "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json similarity index 76% rename from data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json rename to data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json index 99961f779..93f27df2b 100644 --- a/data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json +++ b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_text-davinci-003/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_text-davinci-003/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 (text-davinci-003)", + "name": "GPT-3.5 text-davinci-003", 
"id": "openai/text-davinci-003", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.439, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json rename to data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json index 6aed691a1..800f57826 100644 --- a/data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json +++ b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 Chat (110B)", + "name": "Qwen1.5 Chat 110B", "id": "qwen/qwen1.5-110b-chat", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.55, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA 
- EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json rename to data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json index f6c7858eb..c8749e5f5 100644 --- a/data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json +++ b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (14B)", + "name": "Qwen1.5 14B", "id": "qwen/qwen1.5-14b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.425, "details": { - "description": null, 
"tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": 
[ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json rename to data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json index 1314aa204..699c1515b 100644 --- a/data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json +++ b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (32B)", + "name": "Qwen1.5 32B", "id": "qwen/qwen1.5-32b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.546, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json rename to data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json index 6da42bd5b..8b347b68d 100644 --- a/data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json +++ b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (72B)", + "name": "Qwen1.5 72B", "id": "qwen/qwen1.5-72b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.608, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json rename to data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json index a4d0226b9..b1bc89d92 100644 --- a/data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json +++ b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (7B)", + "name": "Qwen1.5 7B", "id": "qwen/qwen1.5-7b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.275, "details": { - "description": null, "tab": 
"Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - 
"abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json similarity index 76% rename from data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json rename to data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json index 4e8665e6b..58edcde03 100644 --- a/data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json +++ b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2 Instruct (72B)", + "name": "Qwen2 Instruct 72B", "id": "qwen/qwen2-72b-instruct", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + 
"dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.77, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json similarity index 76% rename from data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json rename to data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json index 9e7699d4b..3e08a0cdf 100644 --- a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json +++ b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.745, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": 
"NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - 
"us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json similarity index 76% rename from data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json rename to data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json index 126ae4e72..3f844c281 100644 --- a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json +++ b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.488, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json similarity index 77% rename from data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json rename to data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json index a52059819..09f377d89 100644 --- a/data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json +++ b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.338, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + 
}, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": 
"multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json similarity index 76% rename from data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json rename to data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json index 518458e37..2bf240f96 100644 --- a/data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json +++ b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/tiiuae_falcon-40b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Falcon (40B)", + "name": "Falcon 40B", "id": "tiiuae/falcon-40b", "developer": "tiiuae", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.217, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 
+311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - 
"international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json similarity index 76% rename from data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json rename to data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json index 4a1515414..9a704269c 100644 --- a/data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json +++ b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/tiiuae_falcon-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Falcon (7B)", + "name": "Falcon 7B", "id": "tiiuae/falcon-7b", "developer": "tiiuae", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.064, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json similarity index 76% rename from data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json rename to data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json index fd33bd463..1f111d01c 100644 --- a/data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json +++ b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/upstage_solar-pro-241126/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.602, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for 
reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + 
"evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json similarity index 76% rename from data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json rename to data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json index 574c20cd8..8026be475 100644 --- a/data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json +++ b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-004/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-004/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.808, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + 
"dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -93,13 +105,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -173,14 +194,23 @@ } }, "generation_config": { - "mode": "closedbook", - "stop": "none" + "additional_details": { + "mode": "closedbook", + "stop": "none" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -224,14 +254,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -275,20 +314,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - 
"us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -332,25 +380,34 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True", - "stop": "none" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True", + "stop": "none" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,13 +451,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -444,20 +510,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + 
"abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -500,12 +575,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -549,14 +633,16 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ], - "stop": "none" + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ], + "stop": "none" + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json similarity index 76% rename from data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json rename to data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json index 85f887f2f..5e5faf9fb 100644 --- a/data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json +++ b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V2 (33B)", + "name": "Palmyra X V2 33B", "id": "writer/palmyra-x-v2", "developer": "writer", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.589, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json similarity index 76% rename from data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json rename to data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json index ae69f6c5b..c8073d254 100644 --- a/data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json +++ b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V3 (72B)", + "name": "Palmyra X V3 72B", "id": "writer/palmyra-x-v3", "developer": "writer", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json similarity index 78% rename from data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json rename to data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json index a5bdb42fc..a5d4de71f 100644 --- a/data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json +++ b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/01-ai_yi-34b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (34B)", + "name": "Yi 34B", "id": "01-ai/yi-34b", "developer": "01-ai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + 
"college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, 
@@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.315, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json similarity index 78% rename from data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json rename to data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json index 5b2c50278..1f0a7e20f 100644 --- a/data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json +++ b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/01-ai_yi-6b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": 
"Yi (6B)", + "name": "Yi 6B", "id": "01-ai/yi-6b", "developer": "01-ai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - 
"mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + 
"evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 
+2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.651, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json similarity index 78% rename from data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json rename to data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json index 938fbc9f2..4838cda1c 100644 --- a/data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json +++ b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": 
"helm_mmlu/01-ai_yi-large-preview/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi Large (Preview)", + "name": "Yi Large Preview", "id": "01-ai/yi-large-preview", "developer": "01-ai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - 
"mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + 
"mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer 
Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - 
"evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { 
- "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.258, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json 
b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json similarity index 78% rename from data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json rename to data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json index b05362e32..45536e1a1 100644 --- a/data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json +++ b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", 
+ "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, 
"generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 
+1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - 
"evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { 
+ "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ 
} }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How 
many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.147, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json similarity index 78% rename from data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json rename to data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json index 374350118..727c60261 100644 --- a/data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json +++ b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", 
- "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + 
"mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" 
+ "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us 
foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": 
"clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, 
"generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.206, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json similarity index 78% rename from data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json rename to data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json index 2f32db71e..3a25316d9 100644 --- a/data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json +++ b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - 
"high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", 
+ "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { 
+ "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive 
Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 
+2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.887, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json similarity index 78% rename from data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json rename to data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json index 0ee329ec3..8bf036c64 100644 --- a/data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json +++ b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo 1.7 (7B)", + "name": "OLMo 1.7 7B", "id": "allenai/olmo-1.7-7b", "developer": "allenai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - 
"college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + 
"high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 
+645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 
+1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary 
Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted 
output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json similarity index 78% rename from data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json rename to data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json index dc71abcb3..2b8d4cdfb 100644 --- a/data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json +++ b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/allenai_olmo-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo (7B)", + "name": "OLMo 7B", "id": "allenai/olmo-7b", "developer": "allenai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding 
(MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + 
"business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", 
- "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.68, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json similarity index 78% rename from data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json rename to data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json index 036d68cdd..1bb99dccc 100644 --- a/data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json +++ b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 
+16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - 
"mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": 
{ - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School 
World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - 
"evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": 
"sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.987, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json similarity index 78% rename from data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json rename to data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json index dc2e53d31..ab9b8c843 100644 --- a/data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json +++ b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1767657487.397731", - 
"retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - 
"mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + 
"mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": 
{ + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + 
"subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - 
"subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 1.0, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json similarity index 78% rename from data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json 
rename to data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json index 74dd04dc4..af30c4448 100644 --- a/data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json +++ b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - 
"mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", 
+ "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + 
"subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + 
"additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.975, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json rename to data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json index 94c86600d..c2616d7f8 100644 --- a/data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json +++ b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - 
"mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + 
"mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - 
EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.048, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json rename to data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json index 15ba960b1..76628bf51 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json +++ b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - 
"high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + 
"professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - 
"subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": 
"professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.128, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json index 43e320af9..9d9557efc 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json +++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20240620)", + "name": "Claude 3.5 Sonnet 20240620", "id": "anthropic/claude-3-5-sonnet-20240620", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + 
"college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": 
"abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.17, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json index 7df36bb32..35be68aa6 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json +++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ 
"evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - 
"mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + 
"mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - 
EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": 
"Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + 
"additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } 
}, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.311, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json rename to data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json index 9885a79d4..969900aba 100644 --- 
a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json +++ b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Haiku (20240307)", + "name": "Claude 3 Haiku 20240307", "id": "anthropic/claude-3-haiku-20240307", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - 
"mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + 
"mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, 
"generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 
+1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - 
"evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { 
+ "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ 
} }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How 
many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.28, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json rename to data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json index ab57a1503..230be4291 100644 --- a/data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json +++ b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Opus (20240229)", + "name": "Claude 3 Opus 20240229", "id": "anthropic/claude-3-opus-20240229", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - 
"logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted 
output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": 
"Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + 
"additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - 
"evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.014, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json rename to data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json index 710c70a2e..dd7543ecb 100644 --- a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json +++ b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Sonnet (20240229)", + "name": "Claude 3 Sonnet 20240229", "id": "anthropic/claude-3-sonnet-20240229", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - 
"college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + 
"high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": 
"econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": 
"Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.082, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json rename to data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json index b632a0864..c9e9779b1 100644 --- a/data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json +++ b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains 
[(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + 
"college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.186, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json similarity index 78% rename from data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json rename to data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json index 6ef0cc597..6bebd236d 100644 --- a/data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json +++ b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/cohere_command-r-plus/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 
+16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - 
"mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": 
{ - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School 
World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - 
"evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": 
"sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.825, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json similarity index 78% rename from data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json rename to data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json index 6fa172bf8..e82639d82 100644 --- a/data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json +++ b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/cohere_command-r/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", 
- "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/cohere_command-r/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - 
"mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + 
"mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": 
"logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + 
"additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.959, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json similarity index 78% rename from data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json rename to data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json 
index 8d0b57f82..d5f73b61f 100644 --- a/data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json +++ b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - 
"mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + 
"mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct 
reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - 
"evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical 
fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - 
"evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.537, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff 
--git a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json similarity index 78% rename from data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json rename to data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json index 7837e5696..7ec071041 100644 --- a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json +++ b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "DeepSeek LLM Chat (67B)", + "name": "DeepSeek LLM Chat 67B", "id": "deepseek-ai/deepseek-llm-67b-chat", "developer": "deepseek-ai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - 
"security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + 
"mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" 
+ "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us 
foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": 
"clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, 
"generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.387, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json similarity index 78% rename from data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json rename to data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json index b9d5d50e7..200a6e19c 100644 --- a/data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json +++ b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", 
- "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + 
"professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - 
"subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": 
"professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json rename to data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json index 4fb164090..86096274a 100644 --- a/data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json +++ b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.0 Pro (001)", + "name": "Gemini 1.0 Pro 001", "id": "google/gemini-1.0-pro-001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All 
Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + 
"elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + 
"additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": 
"Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", 
+ "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.677, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json rename to data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json index a91e47447..7aac2d734 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (001)", + "name": "Gemini 1.5 Flash 001", "id": "google/gemini-1.5-flash-001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - 
EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - 
"mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", 
- "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + 
] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, 
"generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - 
"subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + 
"additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.47, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json rename to data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json index c8a9b1912..a87c94c3b 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - 
"mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + 
"mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 
+769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } 
}, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World 
History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.817, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json 
b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json rename to data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json index ffdf7910d..b8d59d877 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (0514 preview)", + "name": "Gemini 1.5 Flash 0514 preview", "id": "google/gemini-1.5-flash-preview-0514", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - 
"virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + 
"mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": 
"global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive 
Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": 
"international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": 
"moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.713, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json rename to data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json index 0115a3fa0..0632aee68 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (001)", + "name": "Gemini 1.5 Pro 001", "id": "google/gemini-1.5-pro-001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - 
"high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + 
"philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": 
"professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.349, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json rename to data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json index 1c57dbb48..d6a3ba87a 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All 
Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + 
"elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + 
"additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": 
"Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", 
+ "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.334, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json rename to data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json index 065435cc3..de3a77c03 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (0409 preview)", + "name": "Gemini 1.5 Pro 0409 preview", "id": "google/gemini-1.5-pro-preview-0409", "developer": "google", "inference_platform": 
"unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - 
"mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": 
{ - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School 
World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - 
"evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": 
"sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.118, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json similarity index 78% rename from data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json rename to data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json index 9b4101c21..6b53de064 100644 --- a/data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json +++ b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": 
"helm_mmlu/google_gemini-2.0-flash-exp/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash (Experimental)", + "name": "Gemini 2.0 Flash Experimental", "id": "google/gemini-2.0-flash-exp", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - 
"mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + 
"mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - 
"evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical 
fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - 
"evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.567, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff 
--git a/data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json similarity index 78% rename from data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json rename to data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json index 2a0eccbe5..8720cc062 100644 --- a/data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json +++ b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-2-27b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 (27B)", + "name": "Gemma 2 27B", "id": "google/gemma-2-27b", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + 
"mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - 
"evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual 
Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { 
+ "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", 
+ "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.05, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json similarity index 78% rename from data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json rename to data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json index 7b83a32f9..2007b06df 100644 --- a/data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json +++ b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-2-9b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 (9B)", + "name": "Gemma 2 9B", "id": "google/gemma-2-9b", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - 
"logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted 
output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": 
"Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + 
"additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - 
"evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.265, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json similarity index 78% rename from data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json rename to data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json index 1480d9d56..963d13c9a 100644 --- a/data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json +++ b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma (7B)", + "name": "Gemma 7B", "id": "google/gemma-7b", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - 
"high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", 
+ "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } 
} }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical 
Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances 
that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive 
Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json similarity index 78% rename from data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json rename to data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json index a20b853b7..c0271bcb3 100644 --- a/data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json +++ b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_text-bison@001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_text-bison@001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Bison)", + "name": "PaLM-2 Bison", "id": "google/text-bison@001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + 
"global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": 
"abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + 
"evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human 
sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - 
"evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.192, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json similarity index 78% rename from data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json rename to data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json index 061cfda40..42c5040aa 100644 --- a/data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json +++ b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_text-unicorn@001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Unicorn)", + "name": "PaLM-2 Unicorn", "id": "google/text-unicorn@001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", 
+ "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - 
"mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { 
+ "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction 
of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.142, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json similarity index 78% rename from data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json rename to data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json index 999bc7bce..453cd8b3a 100644 --- a/data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json +++ b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-13b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": 
"0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (13B)", + "name": "Llama 2 13B", "id": "meta/llama-2-13b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - 
"mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + 
"mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": 
"jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": 
"conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - 
"subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.502, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json similarity index 78% rename from data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json rename to 
data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json index 2bd647ad6..aa6a9caa2 100644 --- a/data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json +++ b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-70b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (70B)", + "name": "Llama 2 70B", "id": "meta/llama-2-70b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - 
"mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + 
"mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, 
"generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 
+1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - 
"evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { 
+ "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ 
} }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How 
many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.508, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json similarity index 78% rename from data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json rename to data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json index f7641555c..0649e7329 100644 --- a/data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json +++ b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (7B)", + "name": "Llama 2 7B", "id": "meta/llama-2-7b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - 
"prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + 
"mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 
@@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { 
- "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", 
+ "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + 
"evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.681, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json similarity index 78% rename from data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json rename to data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json index 028924f0a..4f09a5ee3 100644 --- a/data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json +++ b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3-70b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (70B)", + "name": "Llama 3 70B", "id": "meta/llama-3-70b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - 
"high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + 
"moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, 
"generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json similarity index 78% rename from data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json rename to data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json index 493305a26..83f907e80 100644 --- a/data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json +++ b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3-8b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (8B)", + "name": "Llama 3 8B", "id": "meta/llama-3-8b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } 
}, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + 
"high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": 
"Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ 
} }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.733, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json rename to data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json index 5e68e1b5a..c4ce37e9d 100644 --- a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json +++ b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - 
"mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { 
+ "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction 
of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.33, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json rename to data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json index 7f880e52b..0e4b849f9 100644 --- a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json +++ b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - 
"mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + 
"mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 
+769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } 
}, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World 
History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.021, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json 
b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json rename to data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json index bdc0510b6..6c1d661d4 100644 --- a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json +++ b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + 
"mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in 
the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + 
"additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", 
+ "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.475, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json rename to data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json index e9ec2f904..599cd6855 100644 --- a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json +++ b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (11B)", + "name": "Llama 3.2 Vision Instruct Turbo 11B", "id": "meta/llama-3.2-11b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - 
"high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + 
"moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, 
"generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.897, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json rename to data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json index 51cb25f1e..f14700c78 100644 --- a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json +++ b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (90B)", + "name": "Llama 3.2 Vision Instruct Turbo 90B", "id": "meta/llama-3.2-90b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + 
"college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, 
@@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.773, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json rename to data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json index 124028675..faf8ae128 100644 --- a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json +++ b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": 
"1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.3 Instruct Turbo (70B)", + "name": "Llama 3.3 Instruct Turbo 70B", "id": "meta/llama-3.3-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - 
"mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + 
"mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": 
"logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + 
"additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.722, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json similarity index 78% rename from data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json rename to data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json index f3162d0fe..95bd9f1b8 100644 --- 
a/data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json +++ b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-2/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-2/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - 
"mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + 
"mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer 
Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - 
"evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { 
- "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git 
a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json similarity index 78% rename from data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json rename to data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json index 97f9c3c96..f1d62a268 100644 --- a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json +++ b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (14B)", + "name": "Phi-3 14B", "id": "microsoft/phi-3-medium-4k-instruct", "developer": "microsoft", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - 
"sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", 
+ "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { 
+ "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": 
"international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": 
"moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.015, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json similarity index 78% rename from data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json rename to data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json index 9da3cad91..bbe3afca0 100644 --- a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json +++ b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (7B)", + "name": "Phi-3 7B", "id": "microsoft/phi-3-small-8k-instruct", "developer": "microsoft", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", 
- "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + 
"moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, 
"generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.708, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json rename to data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json index 2592b75a7..e788149e1 100644 --- a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json +++ b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + 
"computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.509, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json rename to data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json index 77ee3f1a1..5ca508d3b 100644 --- a/data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json +++ b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral v0.1 (7B)", + "name": 
"Mistral v0.1 7B", "id": "mistralai/mistral-7b-v0.1", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - 
"mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + 
"evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 
+2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.213, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json rename to data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json index c34e3e47f..6b7873124 100644 --- a/data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json +++ b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json @@ -1,10 +1,7 @@ { - 
"schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2402)", + "name": "Mistral Large 2402", "id": "mistralai/mistral-large-2402", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - 
"mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + 
"mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct 
reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - 
"evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical 
fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - 
"evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.464, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff 
--git a/data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json rename to data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json index 4e005a631..58aa6a379 100644 --- a/data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json +++ b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large 2 (2407)", + "name": "Mistral Large 2 2407", "id": "mistralai/mistral-large-2407", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - 
"us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + 
"mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { 
+ "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": 
"international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": 
"moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.24, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json rename to data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json index ddc506063..457d9ed2a 100644 --- a/data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json +++ b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small (2402)", + "name": "Mistral Small 2402", "id": "mistralai/mistral-small-2402", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - 
"high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + 
"moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, 
"generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.54, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json similarity index 78% rename from data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json rename to data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json index 35cc50f7b..c7ab33c35 100644 --- a/data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json +++ b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x22B)", + "name": "Mixtral 8x22B", "id": "mistralai/mixtral-8x22b", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + 
"formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": 
"abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + 
"evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human 
sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - 
"evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.598, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json similarity index 78% rename from data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json rename to data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json index 247f8572e..3ed7c6104 100644 --- a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json +++ b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x7B 32K seqlen)", + "name": "Mixtral 8x7B 32K seqlen", "id": "mistralai/mixtral-8x7b-32kseqlen", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All 
Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - 
"mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", 
- "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + 
] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, 
"generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - 
"subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + 
"additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.689, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json similarity index 78% rename from data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json rename to data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json index 20e5d8bc5..e5aec6b67 100644 --- a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json +++ b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1767657487.397731", - "retrieved_timestamp": 
"1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral NeMo (2402)", + "name": "Mistral NeMo 2402", "id": "mistralai/open-mistral-nemo-2407", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - 
"mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + 
"mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + 
"evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High 
School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact 
match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": 
"Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json 
b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json similarity index 78% rename from data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json rename to data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json index 61bdc2a92..e429d6dbc 100644 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json +++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0125)", + "name": "GPT-3.5 Turbo 0125", "id": "openai/gpt-3.5-turbo-0125", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - 
"mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + 
"mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - 
"evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual 
Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { 
+ "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", 
+ "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.493, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json similarity index 78% rename from data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json rename to data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json index a7037b692..92faf2169 100644 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json +++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0613)", + "name": "GPT-3.5 Turbo 0613", "id": "openai/gpt-3.5-turbo-0613", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", 
- "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + 
"world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + 
"additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", 
+ "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + 
"additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 
+2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.589, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json rename to data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json index 8a368f8b6..6ccc418f3 100644 --- a/data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json +++ b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 (0613)", + "name": "GPT-4 0613", "id": "openai/gpt-4-0613", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - 
"college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + 
"high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, 
"generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" 
+ ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": 
"Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.517, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json rename to data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json index 41438331c..610be9719 100644 --- a/data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json +++ b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (1106 preview)", + "name": "GPT-4 Turbo 1106 preview", "id": "openai/gpt-4-1106-preview", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - 
"mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches 
a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - 
"evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + 
"additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": 
"medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - 
EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.416, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json rename to data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json index a7796e764..a348a9fb9 100644 --- a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json +++ b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": 
"helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (2024-04-09)", + "name": "GPT-4 Turbo 2024-04-09", "id": "openai/gpt-4-turbo-2024-04-09", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - 
"mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + 
"mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": 
{ + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + 
"subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - 
"subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.351, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json similarity index 78% rename from 
data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json rename to data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json index 1572c27c7..76ba53d53 100644 --- a/data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json +++ b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-05-13)", + "name": "GPT-4o 2024-05-13", "id": "openai/gpt-4o-2024-05-13", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - 
"mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + 
"mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - 
EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.671, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json rename to data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json index 4ba84b207..2d538eb02 100644 --- a/data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json +++ b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-08-06)", + "name": "GPT-4o 2024-08-06", "id": "openai/gpt-4o-2024-08-06", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - 
"human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + 
"world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + 
"additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", 
+ "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + 
"additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 
+2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.52, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json rename to data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json index f69b1b3d4..7753003a8 100644 --- a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json +++ b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - 
"abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + 
"high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } 
} }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 
+1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary 
Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted 
output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.774, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json rename to data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json index 190b1dce2..4b924f5af 100644 --- a/data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json +++ b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 Chat (110B)", + "name": "Qwen1.5 Chat 110B", "id": "qwen/qwen1.5-110b-chat", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + 
"additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + 
"evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.875, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json rename to data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json index 7ff151a72..9bfc87f91 100644 --- a/data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json +++ b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": 
"third_party" }, "model_info": { - "name": "Qwen1.5 (14B)", + "name": "Qwen1.5 14B", "id": "qwen/qwen1.5-14b", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - 
"mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + 
"mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": 
"Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances 
that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": 
"Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.796, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json rename to data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json index 421333da5..d1a9f19e1 100644 --- a/data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json +++ b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - 
"evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (32B)", + "name": "Qwen1.5 32B", "id": "qwen/qwen1.5-32b", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - 
"mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + 
"mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer 
Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - 
"evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { 
- "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.624, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json 
b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json rename to data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json index d14327eec..94c5e4e80 100644 --- a/data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json +++ b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (72B)", + "name": "Qwen1.5 72B", "id": "qwen/qwen1.5-72b", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - 
"mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + 
"mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - 
EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.65, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json rename to data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json index d9688a597..166da7894 100644 --- a/data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json +++ b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (7B)", + "name": "Qwen1.5 7B", "id": "qwen/qwen1.5-7b", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - 
"machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + 
"mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us 
Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": 
"management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - 
EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.843, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json similarity index 78% rename from data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json rename to data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json index abb62e63a..6f8b955e0 100644 --- a/data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json +++ b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2 Instruct (72B)", + "name": "Qwen2 Instruct 72B", "id": "qwen/qwen2-72b-instruct", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - 
"elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + 
"human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + 
} } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive 
Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": 
"miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.826, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json similarity index 78% rename from data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json rename to data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json index ee06a7f3d..a61d620fd 100644 --- a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json +++ b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + 
"college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, 
@@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.548, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json similarity index 78% rename from data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json rename to data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json index f8033410f..c045e519d 100644 --- a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json +++ b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", 
"source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - 
"mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + 
"mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact 
match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.887, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json similarity index 78% rename from data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json rename to data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json index 
cde071792..0afa77758 100644 --- a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json +++ b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - 
"mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + 
"mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + 
"subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + 
"additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.565, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json similarity index 78% rename from data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json rename to data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json index 7d7fe6a40..2c0cfc48a 100644 --- a/data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json +++ b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - 
"mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + 
"mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + 
"evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": 
"Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction 
of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact 
match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } 
}, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security 
Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.462, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json similarity index 78% rename from data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json rename to data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json index c2c0d493b..c204b253d 100644 --- a/data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json +++ b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - 
"philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + 
"mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, 
"generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + 
"evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - 
"evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.629, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json similarity index 78% rename from data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json rename to data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json index fd6405aa5..2eef769c8 100644 --- a/data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json +++ b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V3 (72B)", + "name": "Palmyra X V3 72B", "id": "writer/palmyra-x-v3", "developer": "writer", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - 
"high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + 
"medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": 
"Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - 
"evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/scripts/HELM/parse_helm_leaderboards.sh b/scripts/HELM/parse_helm_leaderboards.sh new file mode 100755 index 000000000..a89a1a64e --- /dev/null +++ b/scripts/HELM/parse_helm_leaderboards.sh @@ -0,0 +1,9 @@ +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Capabilities --source_data_url https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Lite --source_data_url https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Classic --source_data_url https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Instruct --source_data_url https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_MMLU --source_data_url https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json \ No newline at end of file diff --git a/utils/helm/adapter.py b/utils/helm/adapter.py index 3297cfac9..a3a7aca96 100644 --- a/utils/helm/adapter.py +++ b/utils/helm/adapter.py @@ -22,10 +22,12 @@ EvaluationLog, EvaluationResult, EvaluatorRelationship, + GenerationConfig, MetricConfig, ModelInfo, ScoreDetails, ScoreType, + SourceDataUrl ) import sys @@ -114,7 +116,7 @@ def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> T else: spec = run_spec_names[0] args = spec.split(":", 1)[1].split(",") - + model_details = next( (arg.split("=", 1)[1] for arg in args if arg.startswith("model=")), "", @@ -126,12 +128,14 @@ def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> T if developer == "unknown": developer = get_developer(model_name) - return make_model_info( + model_info = 
make_model_info( model_name=model_name, developer=developer, inference_platform="unknown", - ), model_id + ) + model_info.id = model_id + return model_info def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): """Determine min/max values for each metric column.""" @@ -152,7 +156,6 @@ def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): def convert( leaderboard_name: str, leaderboard_data: List[Dict[str, Any]], - source_data: List[str], ): """Convert HELM leaderboard data into unified evaluation logs.""" retrieved_timestamp = str(time.time()) @@ -172,9 +175,9 @@ def convert( model_name = row[0].get("value") if model_name not in model_infos: - model_info, model_id = extract_model_info_from_row(row, model_name) + model_info = extract_model_info_from_row(row, model_name) model_infos[model_name] = model_info - model_ids[model_name] = model_id + model_ids[model_name] = model_info.id for col_idx, (header, cell) in enumerate(zip(headers[1:], row[1:])): full_eval_name = header.get("value") @@ -190,9 +193,22 @@ def convert( or "instruct" in leaderboard_name.lower() ) + if full_eval_name.lower().startswith('mean'): + metric_name = None + dataset_name = leaderboard_name + evaluation_name = full_eval_name + else: + dataset_name, metric_name = full_eval_name.split(' - ', 1) + evaluation_name = dataset_name + + if metric_name: + evaluation_description = f'{metric_name} on {dataset_name}' + else: + evaluation_description = header.get("description") + if is_new_metric: metric_config = MetricConfig( - evaluation_description=header.get("description"), + evaluation_description=evaluation_description, lower_is_better=header.get("lower_is_better", False), min_score=( 0.0 if mins[col_idx] >= 0 else math.floor(mins[col_idx]) @@ -203,6 +219,14 @@ def convert( score_type=ScoreType.continuous, ) + source_dataset_name = leaderboard_name if leaderboard_name.lower() == 'helm_mmlu' else dataset_name + + source_data = SourceDataUrl( + dataset_name=source_dataset_name, + source_type='url', + url=[args.source_data_url] + ) + generation_config = ( extract_generation_config(cell.get("run_spec_names", [])) if cell.get("run_spec_names") @@ -210,7 +234,8 @@ def convert( ) model_results[model_name][short_name] = EvaluationResult( - evaluation_name=full_eval_name, + evaluation_name=evaluation_name, + source_data=source_data, metric_config=metric_config, score_details=ScoreDetails( score=round(cell.get("value"), 3) @@ -221,7 +246,9 @@ def convert( "tab": tab_name, }, ), - generation_config=generation_config, + generation_config=GenerationConfig( + additional_details=generation_config + ) ) else: # Add extra score details under the same metric @@ -232,12 +259,16 @@ def convert( else f"{full_eval_name} - {tab_name}" ) - existing.score_details.details[detail_key] = { - "description": cell.get("description"), - "tab": tab_name, - "score": cell.get("value"), - } - + setattr( + existing.score_details.details, + detail_key, + { + "description": cell.get("description"), + "tab": tab_name, + "score": cell.get("value"), + } + ) + # Save evaluation logs for model_name, results_by_metric in model_results.items(): model_info = model_infos[model_name] @@ -250,7 +281,7 @@ def convert( ) eval_log = EvaluationLog( - schema_version="0.1.0", + schema_version="0.2.0", evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, source_metadata=make_source_metadata( @@ -259,7 +290,6 @@ def convert( evaluator_relationship=EvaluatorRelationship.third_party, ), model_info=model_info, - source_data=source_data, 
evaluation_results=list(results_by_metric.values()), ) @@ -287,15 +317,13 @@ def convert( args = parse_args() leaderboard_name = args.leaderboard_name.lower() - source_data = [args.source_data_url] print(f"Fetching {leaderboard_name} data from {args.source_data_url}") - leaderboard_data = fetch_json(source_data[0]) + leaderboard_data = fetch_json(args.source_data_url) convert( leaderboard_name=leaderboard_name, - leaderboard_data=leaderboard_data, - source_data=source_data, + leaderboard_data=leaderboard_data ) print("Done!")
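For reviewers, the behavioural core of the adapter change above is the rule that splits a HELM column header such as "World Religions - EM" into a dataset-level evaluation name plus a short description, while aggregate columns such as "Mean win rate" pass through unchanged. A minimal standalone sketch of that rule follows; split_header() and the sample calls are illustrative only and do not exist in utils/helm/adapter.py.

# Illustrative sketch mirroring the header-splitting logic added in convert().
def split_header(full_eval_name: str, leaderboard_name: str):
    if full_eval_name.lower().startswith("mean"):
        # Aggregate columns keep their name; the description later falls back
        # to whatever the leaderboard header provides.
        return full_eval_name, leaderboard_name, None
    # Subject/metric columns are split once on " - ", e.g. "World Religions - EM".
    dataset_name, metric_name = full_eval_name.split(" - ", 1)
    return dataset_name, dataset_name, f"{metric_name} on {dataset_name}"

print(split_header("World Religions - EM", "helm_mmlu"))
# ('World Religions', 'World Religions', 'EM on World Religions')
print(split_header("Mean win rate", "helm_mmlu"))
# ('Mean win rate', 'helm_mmlu', None)

This is why the JSON hunks above rename entries like "Astronomy - EM" to "Astronomy" and replace the long MMLU subject descriptions with short forms such as "EM on Astronomy".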